# Setup - Run only once per Kernel App%conda install https://anaconda.org/conda-forge/openjdk/11.0.1/download/linux-64/openjdk-11.0.1-hacce0ff_1021.tar.bz2# install PySpark%pip install pyspark==3.4.0# restart kernelfrom IPython.core.display import HTMLHTML("<script>Jupyter.notebook.kernel.restart()</script>")
Retrieving notices: ...working... done
Downloading and Extracting Packages:
## Package Plan ##
environment location: /opt/conda
added / updated specs:
- conda-forge/openjdk/11.0.1/download/linux-64::openjdk==11.0.1=hacce0ff_1021
The following NEW packages will be INSTALLED:
openjdk conda-forge/openjdk/11.0.1/download/linux-64::openjdk-11.0.1-hacce0ff_1021
Downloading and Extracting Packages:
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Note: you may need to restart the kernel to use updated packages.
Collecting pyspark==3.4.0
Using cached pyspark-3.4.0-py2.py3-none-any.whl
Requirement already satisfied: py4j==0.10.9.7 in /opt/conda/lib/python3.11/site-packages (from pyspark==3.4.0) (0.10.9.7)
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Note: you may need to restart the kernel to use updated packages.
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
Ivy Default Cache set to: /home/sagemaker-user/.ivy2/cache
The jars for the packages stored in: /home/sagemaker-user/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b02b0b31-b2cc-4ff4-a0ef-5bb8148d9c77;1.0
confs: [default]
found org.apache.hadoop#hadoop-aws;3.2.2 in central
found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 190ms :: artifacts dl 6ms
:: modules in use:
com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 2 | 0 | 0 | 0 || 2 | 0 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-b02b0b31-b2cc-4ff4-a0ef-5bb8148d9c77
confs: [default]
0 artifacts copied, 2 already retrieved (0kB/5ms)
24/12/13 02:15:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
%%timeimport sagemakerfrom sagemaker.spark.processing import PySparkProcessor# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.role = sagemaker.get_execution_role()spark_processor = PySparkProcessor( base_job_name="sm-spark-project", framework_version="3.3", role=role, instance_count=4, instance_type="ml.m5.xlarge", max_runtime_in_seconds=3600,)# s3 pathssession = sagemaker.Session()bucket = session.default_bucket()s3_dataset_path_commments ="s3://bigdatateaching/reddit-project/reddit/parquet/comments/yyyy=*/mm=*/*.parquet"s3_dataset_path_submissions ="s3://bigdatateaching/reddit-project/reddit/parquet/submissions/yyyy=*/mm=*/*.parquet"output_prefix_data ="project"output_prefix_logs =f"spark_logs"# modify this comma separated list to choose the subreddits of interest#subreddits = "technology,chatgpt"subreddits ="investing,cryptocurrency"
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
CPU times: user 2.51 s, sys: 325 ms, total: 2.84 s
Wall time: 4.14 s
%%times3_path =f"s3a://{bucket}/{output_prefix_data}/comments"print(f"reading submissions from {s3_path}")comments = spark.read.parquet(s3_path, header=True)print(f"shape of the comments dataframe is {comments.count():,}x{len(comments.columns)}")
reading submissions from s3a://sagemaker-us-east-1-381491950264/project/comments
shape of the comments dataframe is 3,877,393x17
CPU times: user 103 ms, sys: 50.1 ms, total: 153 ms
Wall time: 3min 58s
# Display a subset of the comment columns for a quick visual sanity check.
comment_cols = ["subreddit", "author", "body", "parent_id", "link_id", "id", "created_utc"]
comments.select(*comment_cols).show()
+--------------+--------------------+--------------------+----------+----------+-------+-----------+
| subreddit| author| body| parent_id| link_id| id|created_utc|
+--------------+--------------------+--------------------+----------+----------+-------+-----------+
|CryptoCurrency| TexasBoyz-713|Dude I wouldn’t s...|t1_jsjxgdf|t3_153et67|jsjxjpu| 1689744125|
|CryptoCurrency| bthemonarch|Moons are the rea...|t1_jsjxcqm|t3_153et67|jsjxjuv| 1689744128|
|CryptoCurrency| keithwee0909|Should I buy some...|t1_jsjxfk8|t3_153et67|jsjxk0f| 1689744130|
|CryptoCurrency| [deleted]| [removed]|t3_153et67|t3_153et67|jsjxk5v| 1689744133|
|CryptoCurrency| Traveler-0854|¢0.63\n\nWe are o...|t3_153et67|t3_153et67|jsjxk5x| 1689744133|
|CryptoCurrency| Illicitterror| Prime Trustusbro|t3_153axby|t3_153axby|jsjxk66| 1689744133|
|CryptoCurrency| vip887|Yup, a tiny mista...|t3_153ky0e|t3_153ky0e|jsjxk85| 1689744134|
|CryptoCurrency| chintokkong|New Yorkers will ...|t3_153et67|t3_153et67|jsjxkfo| 1689744137|
|CryptoCurrency| AutoModerator|\nHere is a [Nitt...|t1_jsjxkfo|t3_153et67|jsjxkgp| 1689744138|
|CryptoCurrency| ziggyzago|I’m having crazy ...|t3_153et67|t3_153et67|jsjxkk4| 1689744140|
|CryptoCurrency| R4ID|with how fast it ...|t3_153lm7j|t3_153lm7j|jsjxkkr| 1689744140|
|CryptoCurrency| RedBunery|> Cathie Wood ass...|t3_153ip93|t3_153ip93|jsjxks6| 1689744143|
|CryptoCurrency| ccModBot|Hello!\n\nDirect ...|t3_153lm7j|t3_153lm7j|jsjxl1m| 1689744148|
|CryptoCurrency|CoolCoolPapaOldSkool|Yellow for me for...|t3_153lm7j|t3_153lm7j|jsjxl24| 1689744148|
|CryptoCurrency| Agile_Ad_7061|No. Prince from N...|t3_153ky0e|t3_153ky0e|jsjxl4z| 1689744150|
|CryptoCurrency| Frogmangy|Literally know a ...|t1_jsjvuyx|t3_153ky0e|jsjxl5i| 1689744150|
|CryptoCurrency| ImNotFromThisWorld|Man I should have...|t3_153et67|t3_153et67|jsjxlx6| 1689744163|
|CryptoCurrency| send_tacoz|Aren't Doge price...|t3_153krw1|t3_153krw1|jsjxmgi| 1689744172|
|CryptoCurrency| OddIndication4|Ask yourself the ...|t3_1539jks|t3_1539jks|jsjxmh7| 1689744172|
|CryptoCurrency| chili21| !gas nova|t3_12pf9pb|t3_12pf9pb|jsjxmpc| 1689744176|
+--------------+--------------------+--------------------+----------+----------+-------+-----------+
only showing top 20 rows
%%times3_path =f"s3a://{bucket}/{output_prefix_data}/submissions"print(f"reading submissions from {s3_path}")submissions = spark.read.parquet(s3_path, header=True)print(f"shape of the submissions dataframe is {submissions.count():,}x{len(submissions.columns)}")
reading submissions from s3a://sagemaker-us-east-1-381491950264/project/submissions
shape of the submissions dataframe is 205,720x21
CPU times: user 12.9 ms, sys: 11.8 ms, total: 24.7 ms
Wall time: 38.8 s
# Display a subset of the submission columns for a quick visual sanity check.
submission_cols = ["subreddit", "author", "title", "selftext", "num_comments", "created_utc"]
submissions.select(*submission_cols).show()
+--------------+-------------------+--------------------+--------------------+------------+-----------+
| subreddit| author| title| selftext|num_comments|created_utc|
+--------------+-------------------+--------------------+--------------------+------------+-----------+
| investing| WichitaFlyer|Will negative pop...|Millennial and ge...| 0| 1689714390|
|CryptoCurrency| V11c7or| AITrader| [removed]| 0| 1689714454|
| investing| Ok_Supermarket9812|Tools to assess f...| [removed]| 1| 1689714522|
|CryptoCurrency| charlythesecond|Terraform Labs Co...| | 0| 1689714532|
|CryptoCurrency| badfishbeefcake|Scam Alert: Celsi...| [removed]| 2| 1689714536|
|CryptoCurrency| Fun-Juggernautyy|Opinions on Hatom...| [removed]| 1| 1689714660|
|CryptoCurrency| Fun-Juggernautyy|Opinions on Hatom...| [removed]| 1| 1689714664|
|CryptoCurrency| OneThatNoseOne|Today, many peopl...|Safe to say that ...| 0| 1689714869|
| investing| Ok_Supermarket9812|Tool to assess fu...|I am looking for ...| 0| 1689714939|
|CryptoCurrency| [deleted]|SUI, its hype and...| [removed]| 1| 1689715044|
|CryptoCurrency| garchmodel|if arkham was rea...| [removed]| 1| 1689715090|
|CryptoCurrency| Fun_Problem_914|hola mrbeast no m...| [removed]| 1| 1689715093|
|CryptoCurrency| syndoms18|SUI, its hype and...| [removed]| 1| 1689715103|
|CryptoCurrency| RassuEst112|Are There Any Non...| [removed]| 1| 1689715260|
| investing| StophJS|Your ETF portfoli...|This may be flagg...| 0| 1689715325|
|CryptoCurrency| DrHunterThompson| MOONS Mass Adoption| [removed]| 1| 1689715452|
|CryptoCurrency| bdiggles|Where can U.S. tr...| [removed]| 1| 1689715500|
|CryptoCurrency| Oneiros1999|Prime Trust Put I...| | 0| 1689715584|
|CryptoCurrency|Artorias_the_hollow|I bought some Moo...| [removed]| 1| 1689715619|
|CryptoCurrency| Nearby_Market3054|Bitcoin mining an...| [removed]| 1| 1689715639|
+--------------+-------------------+--------------------+--------------------+------------+-----------+
only showing top 20 rows
Submissions data overview and quality check
%%times3_path =f"s3a://{bucket}/{output_prefix_data}/submissions"print(f"reading submissions from {s3_path}")submissions = spark.read.parquet(s3_path, header=True)print(f"shape of the submissions dataframe is {submissions.count():,}x{len(submissions.columns)}")
reading submissions from s3a://sagemaker-us-east-1-381491950264/project/submissions
shape of the submissions dataframe is 205,720x21
CPU times: user 16.3 ms, sys: 2.75 ms, total: 19 ms
Wall time: 26.5 s
+-------------------------+------------+
|title |title_length|
+-------------------------+------------+
|Investing lesson for kids|25 |
|Captive Insurance Program|25 |
|Yieldmax Has Cult Losers |25 |
|Investing 150K in Europe?|25 |
|Simple Wash Sell Question|25 |
|Where to start to end 9-5|25 |
|Working to setup taxable |25 |
|$500 TO INVEST PER MONTH!|25 |
|Sell ESPP to max out ROTH|25 |
|Richtech Robotics class B|25 |
|What happened to biotech?|25 |
|401k trading and BTC ETFs|25 |
|What am I actually doing?|25 |
|Wealthsimple and Fidelity|25 |
|Needing investment advice|25 |
|What should I do with 5k?|25 |
|Honest question of "Why?"|25 |
|Bonds for 60/40 Portfolio|25 |
|Investing trends question|25 |
|Strategy for buying stock|25 |
+-------------------------+------------+
# Report rows x columns for the filtered investing submissions dataframe.
# (`filtered_invest_sub` is defined in an earlier cell not shown here.)
n_rows = filtered_invest_sub.count()
n_cols = len(filtered_invest_sub.columns)
print(f"shape of the investing submissions dataframe is {n_rows:,}x{n_cols}")
%%times3_path =f"s3a://{bucket}/{output_prefix_data}/comments"print(f"reading submissions from {s3_path}")comments = spark.read.parquet(s3_path, header=True)print(f"shape of the comments dataframe is {comments.count():,}x{len(comments.columns)}")
reading submissions from s3a://sagemaker-us-east-1-381491950264/project/comments
shape of the comments dataframe is 3,877,393x17
CPU times: user 109 ms, sys: 25.9 ms, total: 135 ms
Wall time: 3min 19s
from pyspark.sql.functions import col, count, when, isnan
from pyspark.sql.types import DoubleType, FloatType

# Count missing values per column. For float/double columns a value is
# "missing" if it is NULL or NaN; for all other columns, NULL only.
_FLOAT_TYPES = (DoubleType(), FloatType())


def _null_count_expr(name):
    # One aggregate expression per column, aliased back to the column name.
    if invest_comments.schema[name].dataType in _FLOAT_TYPES:
        missing = col(name).isNull() | (
            col(name).cast(DoubleType()).isNotNull() & isnan(col(name))
        )
    else:
        missing = col(name).isNull()
    return count(when(missing, name)).alias(name)


comments_null_counts = invest_comments.select(
    [_null_count_expr(c) for c in invest_comments.columns]
)
comments_null_counts.show()
# Count comments whose body text was replaced with the "[removed]" marker.
removed_mask = invest_comments.body == "[removed]"
comments_removed_count = invest_comments.filter(removed_mask).count()
print(f"Number of '[removed]' in body: {comments_removed_count}")
shape of the invest_comments dataframe is 508,050x17
EDA Questions and Analysis
Business goal 1
Determining the most common words and topics discussed on the investing subreddit.
Technical proposal
from pyspark.sql.functions import col, lower, regexp_replace, explode, split
from pyspark.ml.feature import StopWordsRemover

# --- Step 1: text preprocessing ---
# Lowercase the comment body and strip everything except letters/whitespace.
invest_comments_clean = invest_comments.withColumn(
    "cleaned_body", lower(regexp_replace(col("body"), "[^a-zA-Z\\s]", ""))
)
# Tokenize on runs of whitespace.
invest_comments_words = invest_comments_clean.withColumn(
    "words", split(col("cleaned_body"), "\\s+")
)
# Drop English stop words using Spark's built-in remover.
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
invest_comments_filtered = remover.transform(invest_comments_words)

# --- Step 2: word frequency analysis ---
# One row per token; discard empty strings left over from splitting.
words_df = (
    invest_comments_filtered
    .select(explode(col("filtered_words")).alias("word"))
    .filter(col("word") != "")
)
# Frequency of each word, most common first.
word_counts = words_df.groupBy("word").count().orderBy(col("count").desc())
word_counts.show(20)
import matplotlib.pyplot as plt

# Convert the top-20 word counts to pandas for plotting (tiny dataframe).
top_words = word_counts.limit(20).toPandas()

# Plot the top 20 most common words.
plt.figure(figsize=(12, 6))
plt.bar(top_words['word'], top_words['count'], alpha=0.7)
plt.title('Top 20 Most Common Words in Investing Subreddit', fontsize=16)
plt.xlabel('Words', fontsize=14)
# BUG FIX: the bars show raw counts, not thousands; the old label's
# "(in thousands)" misrepresented the axis units.
plt.ylabel('Frequency of words', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()
Business goal 2
Identify trending topics and key investment terms in Reddit discussions to inform content strategy and product development for financial services.
Technical proposal
# Install the wordcloud package into the notebook kernel (shell magic).
!pip install wordcloud
Collecting wordcloud
Using cached wordcloud-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.11/site-packages (from wordcloud) (1.26.4)
Requirement already satisfied: pillow in /opt/conda/lib/python3.11/site-packages (from wordcloud) (10.4.0)
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.11/site-packages (from wordcloud) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (24.1)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (3.2.0)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.11/site-packages (from matplotlib->wordcloud) (2.9.0)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
Using cached wordcloud-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (547 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4
from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import nltk

# Download stopwords if not already available.
nltk.download('stopwords')

# Build a single text corpus from the 'body' column.
# NOTE(review): .collect() pulls every comment body onto the driver — this
# does not scale to large datasets; confirm driver memory is sufficient.
text = " ".join(comment for comment in invest_comments.select('body').rdd.flatMap(lambda x: x).collect())
text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

# Generate the word cloud, excluding common English stopwords.
stop_words = set(stopwords.words('english'))
wordcloud = WordCloud(width=800, height=400, background_color='white',
                      stopwords=stop_words, colormap='viridis').generate(text)

# Plot the word cloud.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Popular Terms in Investing Subreddit Comments')
# BUG FIX: savefig must run BEFORE show(); plt.show() finalizes/clears the
# current figure, so the original order saved an empty canvas (the
# "<Figure ... with 0 Axes>" output confirms this).
plt.savefig('wordcloud.png')
plt.show()
[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data] user/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
<Figure size 640x480 with 0 Axes>
Business goal 3
Determine whether certain events lead to increased discussion volume.
Technical proposal
# Install seaborn for the heatmap visualization below (shell magic).
!pip install seaborn
Collecting seaborn
Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /opt/conda/lib/python3.11/site-packages (from seaborn) (1.26.4)
Requirement already satisfied: pandas>=1.2 in /opt/conda/lib/python3.11/site-packages (from seaborn) (2.2.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /opt/conda/lib/python3.11/site-packages (from seaborn) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)
Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.0)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0)
Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.2->seaborn) (2023.3)
Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
from pyspark.sql.functions import from_unixtime, weekofyear, hour, avg
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Convert created_utc (epoch seconds) to a timestamp, then derive
# week-of-year and hour-of-day columns.
invest_comments = invest_comments.withColumn("datetime", from_unixtime(invest_comments.created_utc))
invest_comments = invest_comments.withColumn("week_of_the_year", weekofyear("datetime"))
invest_comments = invest_comments.withColumn("hour_of_the_day", hour("datetime"))

# Average comment score per (week, hour) bucket.
engagement_data = invest_comments.groupBy("week_of_the_year", "hour_of_the_day").agg(
    avg("score").alias("average_score")
)

# Small aggregate (max 53 weeks x 24 hours) — safe to convert to pandas.
engagement_pd = engagement_data.toPandas()

# Pivot into a week-by-hour grid for the heatmap.
heatmap_data = engagement_pd.pivot(index="week_of_the_year", columns="hour_of_the_day", values="average_score")

plt.figure(figsize=(15, 10))
sns.heatmap(heatmap_data, cmap='viridis', annot=False, fmt='.2f', linewidths=0.5, cbar=True)
# BUG FIX: the metric is the average comment *score* (avg("score")), not the
# number of comments; the old title mislabeled the plot.
plt.title('Average Comment Score in Investing subreddit per Hour and Week of the Year', fontsize=16)
plt.xlabel('Hour of the Day (in hours)', fontsize=14)
plt.ylabel('Week of the Year (in weeks)', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()  # Ensure everything fits nicely
plt.show()
Business goal 4
Determine which stocks are most frequently mentioned on Reddit to gauge general interest and popularity.
Technical proposal
# Extract candidate stock tickers from the Reddit comment bodies.
from pyspark.sql.functions import regexp_extract, col

# Pattern: a standalone run of 1-5 capital letters (e.g. AAPL).
# NOTE(review): this also matches ordinary all-caps words ("I", "A", "THE"),
# and regexp_extract captures only the FIRST match per comment — confirm
# this noise level is acceptable for the analysis.
ticker_regex = r"\b[A-Z]{1,5}\b"

# Pull the first candidate ticker out of each comment body.
potential_tickers = invest_comments.withColumn(
    "ticker", regexp_extract(col("body"), ticker_regex, 0)
)
# Keep only rows where the regex actually matched something.
valid_tickers = potential_tickers.filter(col("ticker") != "")
# Mention count per ticker, most mentioned first.
ticker_counts = valid_tickers.groupBy("ticker").count().orderBy(col("count").desc())
ticker_counts.show(20)
Collecting yfinance
Using cached yfinance-0.2.50-py2.py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: pandas>=1.3.0 in /opt/conda/lib/python3.11/site-packages (from yfinance) (2.2.3)
Requirement already satisfied: numpy>=1.16.5 in /opt/conda/lib/python3.11/site-packages (from yfinance) (1.26.4)
Requirement already satisfied: requests>=2.31 in /opt/conda/lib/python3.11/site-packages (from yfinance) (2.32.3)
Collecting multitasking>=0.0.7 (from yfinance)
Using cached multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: lxml>=4.9.1 in /opt/conda/lib/python3.11/site-packages (from yfinance) (5.3.0)
Requirement already satisfied: platformdirs>=2.0.0 in /opt/conda/lib/python3.11/site-packages (from yfinance) (3.11.0)
Requirement already satisfied: pytz>=2022.5 in /opt/conda/lib/python3.11/site-packages (from yfinance) (2023.3)
Requirement already satisfied: frozendict>=2.3.4 in /opt/conda/lib/python3.11/site-packages (from yfinance) (2.4.6)
Collecting peewee>=3.16.2 (from yfinance)
Using cached peewee-3.17.8-cp311-cp311-linux_x86_64.whl
Requirement already satisfied: beautifulsoup4>=4.11.1 in /opt/conda/lib/python3.11/site-packages (from yfinance) (4.12.3)
Collecting html5lib>=1.1 (from yfinance)
Using cached html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.11/site-packages (from beautifulsoup4>=4.11.1->yfinance) (2.5)
Requirement already satisfied: six>=1.9 in /opt/conda/lib/python3.11/site-packages (from html5lib>=1.1->yfinance) (1.16.0)
Requirement already satisfied: webencodings in /opt/conda/lib/python3.11/site-packages (from html5lib>=1.1->yfinance) (0.5.1)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.3.0->yfinance) (2.9.0)
Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.11/site-packages (from pandas>=1.3.0->yfinance) (2024.2)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.11/site-packages (from requests>=2.31->yfinance) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.11/site-packages (from requests>=2.31->yfinance) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.11/site-packages (from requests>=2.31->yfinance) (1.26.19)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.11/site-packages (from requests>=2.31->yfinance) (2024.8.30)
Using cached yfinance-0.2.50-py2.py3-none-any.whl (102 kB)
Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB)
Using cached multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Installing collected packages: peewee, multitasking, html5lib, yfinance
Successfully installed html5lib-1.1 multitasking-0.0.11 peewee-3.17.8 yfinance-0.2.50
import pandas as pd
import plotly.express as px
import yfinance as yf
from pyspark.sql.functions import col, regexp_extract, count


# Step 1: extract stock tickers and mention counts from the Reddit data.
def extract_ticker_mentions(reddit_df, ticker_regex):
    """Extract and count stock tickers mentioned in Reddit posts.

    :param reddit_df: PySpark DataFrame with Reddit comments/submissions
    :param ticker_regex: Regex pattern to identify stock tickers
        (e.g. capital letters, 1-5 characters)
    :return: Pandas DataFrame with tickers and their mention counts
    """
    candidates = reddit_df.withColumn(
        'ticker', regexp_extract(col('body'), ticker_regex, 0)
    )
    mentions = (
        candidates
        .filter(col('ticker') != "")
        .groupBy('ticker')
        .agg(count('*').alias('mention_count'))
        .orderBy(col('mention_count').desc())
        .limit(50)  # Top 50 tickers
    )
    return mentions.toPandas()


ticker_regex = r'\b[A-Z]{1,5}\b'
# NOTE(review): s3_path holds whatever the previously executed cell set it to
# (comments or submissions) — confirm this reads the intended dataset.
reddit_df = spark.read.parquet(s3_path)
top_tickers = extract_ticker_mentions(reddit_df, ticker_regex)
# Step 2: fetch market data with company names from Yahoo Finance.
def fetch_market_data_yahoo(tickers):
    """Fetch market data including company names, market capitalization,
    and outstanding shares.

    :param tickers: List of stock tickers.
    :return: DataFrame with ticker, company_name, market_cap, and trade_count.
    """
    market_data = []
    for ticker in tickers:
        try:
            stock = yf.Ticker(ticker)
            info = stock.info
            market_data.append({
                "ticker": ticker,
                "company_name": info.get("shortName", ticker),   # fall back to the ticker
                "market_cap": info.get("marketCap", 0),          # market capitalization
                "trade_count": info.get("sharesOutstanding", 0)  # outstanding shares
            })
        except Exception as e:
            # Best-effort: skip symbols Yahoo does not recognize (e.g. crypto
            # tickers or plain words matched by the regex) and keep going.
            print(f"Error fetching data for {ticker}: {e}")
    return pd.DataFrame(market_data)


market_data = fetch_market_data_yahoo(top_tickers['ticker'].tolist())

# Step 3: merge Reddit mentions with market data, then drop rows with
# missing/zero values so the bubble sizes are well defined.
merged_data = pd.merge(top_tickers, market_data, on='ticker', how='inner')
merged_data = merged_data[
    (merged_data['market_cap'] > 0)
    & (merged_data['trade_count'] > 0)
    & (merged_data['mention_count'] > 0)
]


def create_interactive_bubble_plot(data, save_as="html"):
    """Create an interactive bubble plot without a dropdown and save it.

    :param data: Pandas DataFrame with 'mention_count', 'market_cap',
        'trade_count', 'ticker' and 'company_name' columns
    :param save_as: Format to save the plot ('png' or 'html')
    """
    # BUG FIX: operate on a copy so the caller's DataFrame is not mutated
    # by the added 'label' column.
    data = data.copy()
    # Combine ticker and company name for hover display.
    data['label'] = data['ticker'] + " - " + data['company_name']

    fig = px.scatter(
        data,
        x='mention_count',
        y='market_cap',
        size='trade_count',
        color='ticker',
        hover_name='label',  # show "TICKER - Company" on hover
        size_max=60,
        labels={
            # BUG FIX: the axes plot raw values; the old labels claimed
            # "(in thousands)" / "in Billions", misrepresenting the units.
            'mention_count': 'Mention Count',
            'market_cap': 'Market Capitalization (USD)',
            'trade_count': 'Trade Volume'
        },
        title="Stock Popularity vs Market Cap and Trade Volume",
    )

    # Save in the requested format, then display.
    if save_as == "png":
        fig.write_image("bubble_plot.png")
        print("Plot saved as 'bubble_plot.png'")
    elif save_as == "html":
        fig.write_html("bubble_plot.html")
        print("Plot saved as 'bubble_plot.html'")
    fig.show()


# Example usage
create_interactive_bubble_plot(merged_data, save_as="html")
ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/DYOR?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=DYOR&crumb=CuJz0cgaUfS
ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/USDT?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=USDT&crumb=CuJz0cgaUfS
ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/MATIC?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=MATIC&crumb=CuJz0cgaUfS
Comments data overview and quality check